# ==============================================================================
# file name: 02-summarize-survey-visits-LU.sql
# date:	Dec 14, 2022
# author: Bernhard Clemm 
# purpose: match visit-level data to host lists & summarize survey taking (Lucid)
# THIS SCRIPT REQUIRES ACCESS TO THE RAW DATA AND SERVES FOR REFERENCE ONLY
# ==============================================================================

# Pre-process host lists =======================================================

# To create a final list of hosts, we 
# ... take the union of hosts_500_FB_coded.csv, hosts_500_LU_coded.csv and 
# ... hosts_500_YG_coded.csv (see 02-summarize-survey-visits-YG.R)
# ... re-upload it as hosts_500_all
# ... and create the union of hosts_bevec_FB, hosts_survey_FB and hosts_500_all:

CREATE TABLE hosts_final AS
(SELECT url_host, 'bevec_report' AS method
FROM hosts_bevec_LUCID
UNION
SELECT url_host, 'survey_url' AS method
FROM hosts_survey_LUCID
UNION
SELECT url_host, method
FROM hosts_500_all)

# Match to visits ==============================================================

CREATE TABLE visits_professionals AS
SELECT *, 
-- cutoff at 300 seconds = 5 minutes; all durations > cutoff to be N
CASE 
WHEN DATE_DIFF('second', created_utc, created_utc_next) > 300 THEN NULL
ELSE DATE_DIFF('second', created_utc, created_utc_next) 
END AS duration_s_5_na
FROM
(SELECT v.id, v.person_id, v.wave, v.url_host, h.method,
CASE WHEN h.url_host IS NOT NULL THEN 1 ELSE 0 END as visit_survey,
-- Identify visits to Google, Amazon, Facebook, Youtube for comparison
-- regex approach: match URL hosts should 
--  (1) end on the pattern, e.g. pattern "google.com" should not match a host "google.co"
--  (2) either (a) begin with the pattern, or 
--  (b) have a "." before the pattern, e.g. "news.google.com" should match but not "fakegoogle.com"
CASE WHEN REGEXP_LIKE(v.url_host, '^google\.com$|\.google\.com$') THEN 1 ELSE 0 END as visit_google,
CASE WHEN REGEXP_LIKE(v.url_host, '^amazon\.com$|\.amazon\.com$') THEN 1 ELSE 0 END as visit_amazon,
CASE WHEN REGEXP_LIKE(v.url_host, '^youtube\.com$|\.youtube\.com$') THEN 1 ELSE 0 END as visit_youtube,
CASE WHEN REGEXP_LIKE(v.url_host, '^facebook\.com$|\.facebook\.com$') THEN 1 ELSE 0 END as visit_facebook, 
v.url, v.created_utc, date_format(v.created_utc,'%Y-%m-%d') created_date,
LEAD(v.created_utc) OVER(PARTITION BY v.person_id ORDER BY v.created_utc) created_utc_next
FROM visits_clean_url v
left join hosts_final h on v.url_host = h.url_host
where v.country = 'US')

# Person-wave-level summary ======================================================

SELECT person_id, wave,
COUNT() AS n_total,
SUM(duration_s_5_na) AS s_total_5_na,
COUNT(DISTINCT(created_date)) AS n_days_active,
COUNT(DISTINCT(CASE WHEN visit_survey = 1 THEN created_date ELSE NULL END)) AS n_survey_days_active,
SUM(CASE WHEN visit_survey = 1 THEN duration_s_5_na ELSE 0 END) AS s_survey_5_na,
SUM(visit_survey) AS n_survey,
SUM(CASE WHEN method = 'survey_url' THEN 1 ELSE 0 END) AS n_survey_url, 
SUM(CASE WHEN method = 'bevec_report' THEN 1 ELSE 0 END) AS n_bevec, 
SUM(CASE WHEN method = 'manual_labelling1' THEN 1 ELSE 0 END) AS n_manual_labelling1,
SUM(CASE WHEN method = 'manual_labelling2' THEN 1 ELSE 0 END) AS n_manual_labelling2,
SUM(visit_google) AS n_google,
SUM(visit_youtube) AS n_youtube,
SUM(visit_facebook) AS n_facebook,
SUM(visit_amazon) AS n_amazon,
SUM(CASE WHEN visit_google = 1 THEN duration_s_5_na ELSE 0 END) AS s_google_5_na,
SUM(CASE WHEN visit_youtube = 1 THEN duration_s_5_na ELSE 0 END) AS s_youtube_5_na,
SUM(CASE WHEN visit_facebook = 1 THEN duration_s_5_na ELSE 0 END) AS s_facebook_5_na,
SUM(CASE WHEN visit_amazon = 1 THEN duration_s_5_na ELSE 0 END) AS s_amazon_5_na
FROM visits_professionals
GROUP BY person_id, wave
ORDER BY person_id, wave 

-- exported as data/browsing_processed/people_visits_LU.csv

# Top-10 most popular survey hosts ==========================================================

SELECT url_host, COUNT() ct 
FROM visits_professionals 
WHERE visit_survey = 1 
GROUP BY url_host
ORDER BY ct DESC

-- exported as data/browsing_hosts/hosts_popular_LU.csv

